library(xml2)
library(tibble)
library(readr)
library(stringr)
library(dplyr)

# Helper functions

# Returns the PLOS publication date as an ISO 8601 string (YYYY-MM-DD).
#
# xml: an xml2 node or nodeset for a <pub-date> element; its <day>, <month>
#      and <year> descendants are read. If several nodes are passed, the
#      first one is used.
# Returns a length-one character vector, or NA_character_ when a component is
# missing or day/month is not numeric.
construct_date = function(xml) {
  # xml_find_first() followed by [1] always yields exactly one value (NA when
  # the element is absent), so the checks below never see a zero-length or
  # multi-element vector.
  first_text = function(xpath) {
    xml_text(xml_find_first(xml, xpath))[1]
  }
  # Non-numeric text becomes NA; the coercion warning is redundant because the
  # NA is handled explicitly just below.
  day = suppressWarnings(as.integer(first_text('.//day')))
  month = suppressWarnings(as.integer(first_text('.//month')))
  year = first_text('.//year')
  if (is.na(day) || is.na(month) || is.na(year)) {
    return(NA_character_)
  }
  # %02d pads to two digits without double-padding values such as "05"
  # (paste0(0, "05") would give "005").
  sprintf("%s-%02d-%02d", year, month, day)
}

# Joins all authors into a single string, formatted as "surname, given names"
# and separated by "; ".
#
# xml: an xml2 nodeset (or single node) of <contrib> elements.
# Returns a length-one character vector; "" when there are no usable names.
#
# Contributors that lack a <surname> or <given-names> element (for example
# group authors stored in <collab>) fall back to whichever name part exists
# instead of producing the literal text "NA, NA".
get_authors = function (xml) {
  # xml_find_first() returns one result per contributor; xml_text() gives NA
  # for contributors where the element is absent.
  surname = xml |> xml_find_first('.//surname') |> xml_text()
  given_names = xml |> xml_find_first('.//given-names') |> xml_text()
  collab = xml |> xml_find_first('.//collab') |> xml_text()
  authors = ifelse(
    is.na(surname),
    ifelse(is.na(given_names), collab, given_names),
    ifelse(is.na(given_names), surname, paste(surname, given_names, sep=", "))
  )
  # Drop contributors for which no name could be found at all
  authors = authors[!is.na(authors)]
  return(paste(authors, collapse="; "))
}


# Load the hand-coded list of PLOS qualitative studies
plos_articles = read_csv("data/raw/plos_qualitative_studies.csv")

# Create the metadata columns up front; they hold NA until the article is scraped
metadata_columns = c('data_availability', 'funding', 'abstract', 'pub_date', 'journal', 'authors', 'affiliations')
for (column_name in metadata_columns) {
  plos_articles[[column_name]] = NA
}


# Fix typos: these rows carry a wrong URL in the input file, so overwrite them
typo_rows = c(463, 642, 1044)
corrected_urls = c(
  "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0267375&type=manuscript",
  "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0272334&type=manuscript",
  "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0255145&type=manuscript"
)
plos_articles$url[typo_rows] = corrected_urls
# Some URLs end in "=manuscriptt" (doubled t); normalise them
plos_articles$url = str_replace_all(plos_articles$url, "=manuscriptt", "=manuscript")
# Retrieve article XML from PLOS and fill in metadata.
#
# Every field of an article is extracted before anything is stored, so a row
# is either filled completely or left as NA. A failed download or parse is
# reported as a warning and the row is skipped rather than aborting the run:
# results are only written to disk after the loop, so one bad URL would
# otherwise discard everything fetched so far.
# seq_len() makes the loop a no-op on an empty table (1:0 would visit rows 1
# and 0).
for (i in seq_len(nrow(plos_articles))) {
  print(i)
  article_url = plos_articles$url[[i]]
  print(article_url)
  metadata = tryCatch({
    article = read_xml(article_url)
    list(
      data_availability = article |>
        xml_find_all("//custom-meta[@id='data-availability']/meta-value") |>
        xml_text() |>
        paste(collapse=" "),
      funding = article |>
        xml_find_all("//funding-group/funding-statement") |>
        xml_text() |>
        paste(collapse=" "),
      abstract = article |>
        xml_find_all("//abstract") |>
        xml_text() |>
        paste(collapse=" "),
      pub_date = article |>
        xml_find_all("//pub-date[@pub-type='epub']") |>
        construct_date(),
      journal = article |>
        xml_find_first("//journal-title") |>
        xml_text(),
      authors = article |>
        xml_find_all("//contrib[@contrib-type='author']") |>
        get_authors(),
      affiliations = article |>
        xml_find_all("//aff[contains(@id, 'aff')]//addr-line") |>
        xml_text() |>
        paste(collapse="; ")
    )
  }, error = function(e) {
    warning("Row ", i, " (", article_url, ") skipped: ", conditionMessage(e),
            call. = FALSE, immediate. = TRUE)
    NULL
  })
  if (!is.null(metadata)) {
    # Assign element-wise into each column instead of re-subsetting the
    # tibble row for every field.
    for (field in names(metadata)) {
      plos_articles[[field]][[i]] = metadata[[field]]
    }
  }
  # Pause between requests (also after a failure) to avoid hammering the server
  Sys.sleep(1)
}

# Write the table with the scraped metadata to a CSV file.
# readr::write_csv() never writes row names and has no `row.names` argument
# (that argument belongs to base write.csv()); passing it fails with
# "unused argument" after the whole scrape has already run.
write_csv(plos_articles, "data/raw/plos_metadata.csv")

# Create samples to establish the coding schema (kept here for transparency; the sampled files are not part of the reproducible pipeline)

# full_sample = read_csv("data/raw/plos_metadata.csv")

# coding_schema_sample1 = full_sample %>%
#   sample_n(100)
# 
# coding_schema_sample2 = full_sample %>%
#   sample_n(100)
# 
# coding_schema_sample3 = full_sample %>%
#   sample_n(100)
# 
# write.csv(coding_schema_sample1, "coding_schema_sample1.csv", row.names = FALSE)
# write.csv(coding_schema_sample2, "coding_schema_sample2.csv", row.names = FALSE)
# write.csv(coding_schema_sample3, "coding_schema_sample3.csv", row.names = FALSE)
# 
